# ---- Load data ----
# NOTE(review): setwd() with an absolute Windows path is fragile and will
# break on any other machine; prefer full file paths or a project-relative
# scheme. Left unchanged here to preserve the workshop script's behavior.
setwd("C:/Workshop/Data")
# NOTE(review): the summary() output further down shows character columns
# (sex, name, cabin, ...) as factors, so this was evidently run with
# stringsAsFactors = TRUE (the pre-R-4.0 default) -- confirm before
# re-running under R >= 4.0, where read.csv keeps them as character.
raw <- read.csv("Titanic.csv")
# Quick look at the first six rows of the raw passenger data.
head(raw)
## pclass survived name sex
## 1 1 1 Allen, Miss. Elisabeth Walton female
## 2 1 1 Allison, Master. Hudson Trevor male
## 3 1 0 Allison, Miss. Helen Loraine female
## 4 1 0 Allison, Mr. Hudson Joshua Creighton male
## 5 1 0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female
## 6 1 1 Anderson, Mr. Harry male
## age sibsp parch ticket fare cabin embarked boat body
## 1 29.0000 0 0 24160 211.3375 B5 S 2 NA
## 2 0.9167 1 2 113781 151.5500 C22 C26 S 11 NA
## 3 2.0000 1 2 113781 151.5500 C22 C26 S NA
## 4 30.0000 1 2 113781 151.5500 C22 C26 S 135
## 5 25.0000 1 2 113781 151.5500 C22 C26 S NA
## 6 48.0000 0 0 19952 26.5500 E12 S 3 NA
## home.dest
## 1 St Louis, MO
## 2 Montreal, PQ / Chesterville, ON
## 3 Montreal, PQ / Chesterville, ON
## 4 Montreal, PQ / Chesterville, ON
## 5 Montreal, PQ / Chesterville, ON
## 6 New York, NY
# Per-column summaries: quartiles for numeric columns, counts for factors.
# Data-quality issues visible in the output below: age has 263 NAs, fare
# has 1, body has 1188, and cabin/boat/home.dest contain many empty strings.
summary(raw)
## pclass survived name
## Min. :1.000 Min. :0.000 Connolly, Miss. Kate : 2
## 1st Qu.:2.000 1st Qu.:0.000 Kelly, Mr. James : 2
## Median :3.000 Median :0.000 Abbing, Mr. Anthony : 1
## Mean :2.295 Mean :0.382 Abbott, Master. Eugene Joseph : 1
## 3rd Qu.:3.000 3rd Qu.:1.000 Abbott, Mr. Rossmore Edward : 1
## Max. :3.000 Max. :1.000 Abbott, Mrs. Stanton (Rosa Hunt): 1
## (Other) :1301
## sex age sibsp parch
## female:466 Min. : 0.1667 Min. :0.0000 Min. :0.000
## male :843 1st Qu.:21.0000 1st Qu.:0.0000 1st Qu.:0.000
## Median :28.0000 Median :0.0000 Median :0.000
## Mean :29.8811 Mean :0.4989 Mean :0.385
## 3rd Qu.:39.0000 3rd Qu.:1.0000 3rd Qu.:0.000
## Max. :80.0000 Max. :8.0000 Max. :9.000
## NA's :263
## ticket fare cabin embarked
## CA. 2343: 11 Min. : 0.000 :1014 : 2
## 1601 : 8 1st Qu.: 7.896 C23 C25 C27 : 6 C:270
## CA 2144 : 8 Median : 14.454 B57 B59 B63 B66: 5 Q:123
## 3101295 : 7 Mean : 33.295 G6 : 5 S:914
## 347077 : 7 3rd Qu.: 31.275 B96 B98 : 4
## 347082 : 7 Max. :512.329 C22 C26 : 4
## (Other) :1261 NA's :1 (Other) : 271
## boat body home.dest
## :823 Min. : 1.0 :564
## 13 : 39 1st Qu.: 72.0 New York, NY : 64
## C : 38 Median :155.0 London : 14
## 15 : 37 Mean :160.8 Montreal, PQ : 10
## 14 : 33 3rd Qu.:256.0 Cornwall / Akron, OH: 9
## 4 : 31 Max. :328.0 Paris, France : 9
## (Other):308 NA's :1188 (Other) :639
# Pairwise scatterplot matrix of every column (quick visual scan).
plot(raw)
# Correlogram of the numeric columns.
library(corrgram)
corrgram(raw)
# Total number of NA cells in the whole data set.
sum(is.na(raw))
## [1] 1452
library(dplyr)

# Build the modeling data set from the raw passengers:
#  - impute missing ages with the mean of the observed ages
#    (mean(age, na.rm = TRUE) is the idiomatic form of mean(na.omit(age))
#    and yields the identical value),
#  - derive family size as siblings/spouses + parents/children aboard,
#  - recode survived (0/1) as a "No"/"Yes" factor for classification,
#  - keep and rename only the four predictors plus the outcome.
# The three consecutive mutate() calls are merged into one; later
# expressions may still refer to columns created earlier in the same call.
clean <- raw %>%
  mutate(
    age = ifelse(is.na(age), mean(age, na.rm = TRUE), age),
    family = sibsp + parch,
    survived = factor(ifelse(survived == 0, "No", "Yes"))
  ) %>%
  select(
    Class = pclass,
    Sex = sex,
    Age = age,
    Family = family,
    Survived = survived
  ) %>%
  as.data.frame()
library(caret)

# Reproducible, outcome-stratified 80/20 split into training and test sets.
# The seed is set immediately before the partition so the split is stable.
set.seed(42)
train_rows <- createDataPartition(
  clean$Survived,
  p = .8,
  list = FALSE,
  times = 1)
train <- clean[train_rows, ]
test <- clean[-train_rows, ]
# Shared training setup used by all three models below:
# center and scale every predictor, evaluate with 10-fold cross-validation.
preProcess <- c("center", "scale")
control <- trainControl(method = "cv", number = 10)

# k-nearest-neighbours classifier; tuneLength = 5 lets caret try five
# candidate values of k, selecting the one with the best CV accuracy.
knnModel <- train(
  Survived ~ .,
  data = train,
  method = "knn",
  metric = "Accuracy",
  preProcess = preProcess,
  trControl = control,
  tuneLength = 5)
print(knnModel)
## k-Nearest Neighbors
##
## 1048 samples
## 4 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: centered (4), scaled (4)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 943, 943, 943, 944, 943, 943, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.8015842 0.5754371
## 7 0.7929762 0.5562429
## 9 0.8034890 0.5779054
## 11 0.8120788 0.5945939
## 13 0.8120696 0.5931258
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 11.
# Accuracy as a function of k.
plot(knnModel)
# CART decision tree; tuneLength = 5 tries five candidate values of the
# complexity parameter cp, keeping the one with the best CV accuracy.
treeModel <- train(
  Survived ~ .,
  data = train,
  method = "rpart",
  metric = "Accuracy",
  preProcess = preProcess,
  trControl = control,
  tuneLength = 5)
print(treeModel)
## CART
##
## 1048 samples
## 4 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: centered (4), scaled (4)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 943, 943, 943, 943, 943, 944, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.004375 0.8053388 0.5776352
## 0.005000 0.8072619 0.5811883
## 0.021250 0.8005861 0.5695910
## 0.023750 0.7871978 0.5433680
## 0.450000 0.6727473 0.1890019
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.005.
# Accuracy as a function of cp.
plot(treeModel)
# Hyperparameter grid for the neural network: every combination of
# hidden-layer size (3, 4, 5 units) and weight decay (0.1, 0.01, 0.001).
# expand.grid() generates exactly the same nine rows, in the same order,
# as the hand-written repeated vectors it replaces (size cycles fastest).
neuralTuneGrid <- expand.grid(
  size = c(3, 4, 5),
  decay = c(0.1, 0.01, 0.001))
# Single-hidden-layer neural network (nnet), tuned over the grid above.
# metric = "Accuracy" is caret's default for classification; it is stated
# explicitly here for consistency with the knn and rpart train() calls.
neuralModel <- train(
  form = Survived ~ .,
  data = train,
  method = "nnet",
  metric = "Accuracy",
  preProcess = preProcess,
  trControl = control,
  tuneGrid = neuralTuneGrid)
print(neuralModel)
## Neural Network
##
## 1048 samples
## 4 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: centered (4), scaled (4)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 943, 943, 943, 943, 943, 943, ...
## Resampling results across tuning parameters:
##
## size decay Accuracy Kappa
## 3 0.001 0.8053388 0.5789952
## 3 0.010 0.8139469 0.5966001
## 3 0.100 0.8158700 0.5994007
## 4 0.001 0.8129945 0.5935100
## 4 0.010 0.8168315 0.6029929
## 4 0.100 0.8158700 0.5968726
## 5 0.001 0.8196886 0.6092592
## 5 0.010 0.8129945 0.5953079
## 5 0.100 0.8177564 0.6022955
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were size = 5 and decay = 0.001.
# 4. Plot model accuracy
# Accuracy across the tuning grid for the neural network.
plot(neuralModel)
# Pool the cross-validation resamples from all three models for a
# side-by-side comparison of accuracy and kappa.
# NOTE(review): the seed was set only once, before the first model, so the
# three models do not necessarily share identical CV folds -- confirm
# whether fold-matched comparison was intended.
results <- resamples(list(
knn = knnModel,
tree = treeModel,
nnet = neuralModel))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: knn, tree, nnet
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## knn 0.7428571 0.8023810 0.8142857 0.8120788 0.8281593 0.8557692 0
## tree 0.7238095 0.7852335 0.8229853 0.8072619 0.8357143 0.8476190 0
## nnet 0.7714286 0.8081502 0.8238095 0.8196886 0.8380952 0.8653846 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## knn 0.4521739 0.5805516 0.5956140 0.5945939 0.6252555 0.6849758 0
## tree 0.3817259 0.5313906 0.6191558 0.5811883 0.6373072 0.6673267 0
## nnet 0.5058824 0.5790588 0.6090769 0.6092592 0.6465084 0.7156250 0
# Three views of the same resample comparison.
dotplot(results)
bwplot(results)
densityplot(results, auto.key = TRUE)
# Evaluate the decision tree on the held-out test set.
# NOTE(review): the neural network had the highest mean CV accuracy
# (0.8197 vs 0.8073 for the tree) -- confirm that evaluating treeModel
# rather than neuralModel here is intentional.
final_predictions <- predict(
object = treeModel,
newdata = test)
# Confusion matrix plus accuracy, kappa, sensitivity, etc.
# "No" is treated as the positive class (the first factor level).
finalMatrix <- confusionMatrix(
data = final_predictions,
reference = test$Survived)
print(finalMatrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 134 30
## Yes 27 70
##
## Accuracy : 0.7816
## 95% CI : (0.7265, 0.8302)
## No Information Rate : 0.6169
## P-Value [Acc > NIR] : 9.348e-09
##
## Kappa : 0.5353
## Mcnemar's Test P-Value : 0.7911
##
## Sensitivity : 0.8323
## Specificity : 0.7000
## Pos Pred Value : 0.8171
## Neg Pred Value : 0.7216
## Prevalence : 0.6169
## Detection Rate : 0.5134
## Detection Prevalence : 0.6284
## Balanced Accuracy : 0.7661
##
## 'Positive' Class : No
##
# Survival probability for a hypothetical passenger ("Jack"): third class,
# male, age 20, travelling with one family member, per the neural network.
jack <- data.frame(
  Class = 3,
  Sex = "male",
  Age = 20,
  Family = 1)
predict(neuralModel, newdata = jack, type = "prob")
## No Yes
## 1 0.8598103 0.1401897
# Question: Would you bet on Jack surviving?
# Exercise: Determine the likelihood that YOU would survive the Titanic.
# Question: Would you take that bet?